library(readxl)
library(writexl)
library(dplyr)
library(zoo) #dates
library(tidyr)
library(fastLink) #gtfsr matching
library(stringr)
library(fastLink)
library(ggplot2)

#set working directory to correct folder
setwd("~/Box Sync/Index Blueprint2/Smart Cities Article/Replication Materials/GTREP/") 

#GTFS-s----
gtfs = read_excel("match_data.xlsx")
gtfs$AgencyName <-gtfs$`Agency Name`
gtfs$yrmo = as.yearmon(gtfs$date, "%m/%Y")
gtfs$NTDID <- gtfs$`NTD ID`
gtfs$`Agency Name`=NULL
gtfs$`NTD ID`=NULL

#Cumulative indicator for adoption
gtfs = gtfs %>% group_by(AgencyName) %>% mutate(updates_sum = cumsum(updates))

#extract year from first four chars, save as numeric
gtfs$year = as.numeric(substr(as.numeric(gtfs$yrmo), 1, 4)) 
#if the agency cumulatively has ever adopted, count as "adopt"
gtfs$adopt = ifelse(gtfs$updates_sum > 0, 1, 0)

#NTD Variables---- 
#agency characteristics (Name, organization type, VOMS, reporter type, location, population, service sq. miles)
NTD19=read_excel("2018 Agency Info.xlsx")
#types of transportation services provided
#determines which agencies are in the sampling frame based on mode
NT19=read_excel("2018 Agency Mode TOS.xlsx") 


#subset to the correct agencies: drop agencies with only demand responsive fleet
NT19=NT19 %>% filter(Mode != "DR" &  Mode != "VP" &  Mode != "DT")
NTD19$NTDID=NTD19$`NTD ID`
NTD19=NTD19 %>% filter(State=="CA")
NTD19=NTD19[NTD19$`NTD ID` %in% c(NT19$`NTD ID`),]
#simplify orgtype name
NTD19$orgtype = NTD19$`Organization Type`
NTD19$orgtype = ifelse(NTD19$orgtype == "MPO, COG or Other Planning Agency" | NTD19$orgtype == "Other Publicly-Owned or Privately Chartered Corporation" | NTD19$orgtype == "Private-Non-Profit Corporation"|
                         NTD19$orgtype == "Tribe" | is.na(NTD19$orgtype) |NTD19$orgtype == "University", "Other", NTD19$orgtype)
#Adjust names of orgtype for figures later
NTD19$orgtype[NTD19$orgtype == "City, County or Local Government Unit or Department of Transportation"] <- "Local Government"
NTD19$orgtype[NTD19$orgtype == "Independent Public Agency or Authority of Transit Service"] <- "Independent Public Agency"
NTD19 <- within(NTD19, orgtype <- relevel(as.factor(orgtype), ref = "Local Government"))
NTD19$AgencyName <- NTD19$`Agency Name` 
NTD19$reptype = NTD19$`Reporter Type` #change names so they are less unwieldy
NTD19$pop = NTD19$`Service Area Pop`
NTD19$VOMS = NTD19$`Total VOMS`
NTD19$mile = NTD19$`Service Area Sq Miles`
NTD19 = as.data.frame(NTD19)
NTD19 = NTD19 %>% select(NTDID, orgtype, pop, reptype, VOMS, City, mile, AgencyName, `Legacy NTD ID`) #select variables you want
NTD19$poplog = log(NTD19$pop) #log population
NTD19$milelog=log(NTD19$mile) #log service areas 
NTD19$poplog[NTD19$poplog == "-Inf"] = NA #for Infs (0) make NA
NTD19$milelog[NTD19$milelog == "-Inf"] = NA


##Principal City----

###2013----
prin = read_xls("list2_2013.xls")
names(prin) <- prin[2,] #rename variables 
prin <- prin[-c(1:2),] #delete first 2 rows
prin = subset(prin, prin$`FIPS State Code` == "06") #subset to only CA
prin$City = prin$`Principal City Name`
prin$PC13 <- rep("Principal City",  length(prin$`CBSA Code`))
prin <- prin %>% select(City, PC13)

###2015----
prin2 = read_xls("list2_2015.xls")
names(prin2) <- prin2[2,] #rename variables 
prin2 <- prin2[-c(1:2),] #delete first 2 rows
prin2 = subset(prin2, prin2$`FIPS State Code` == "06") #subset to only CA
prin2$City = prin2$`Principal City Name`
prin2$PC15 <- rep("Principal City",  length(prin2$`CBSA Code`))
prin2 <- prin2 %>% select(City, PC15)



###2018----
prin3 = read_xls("list2_Sep_2018.xls")
names(prin3) <- prin3[2,] #rename variables 
prin3 <- prin3[-c(1:2),] #delete first 2 rows
prin3 = subset(prin3, prin3$`FIPS State Code` == "06") #subset to only CA
prin3$City = prin3$`Principal City Name`
prin3$PC18 <- rep("Principal City",  length(prin3$`CBSA Code`))
prin3 <- prin3 %>% select(City, PC18)

###2020----
prin4 = read_xls("list2_2020.xls")
names(prin4) <- prin4[2,] #rename variables 
prin4 <- prin4[-c(1:2),] #delete first 2 rows
prin4 = subset(prin4, prin4$`FIPS State Code` == "06") #subset to only CA
prin4$City = prin4$`Principal City Name`
prin4$PC20 <- rep("Principal City",  length(prin4$`CBSA Code`))
prin4 <- prin4 %>% select(City, PC20)

#merge with NTD data
NTD19 = merge(NTD19, prin, by.x="City", by.y="City", all.x=TRUE, all.y=FALSE)
NTD19 = merge(NTD19, prin2, by.x="City", by.y="City", all.x=TRUE, all.y=FALSE)
NTD19 = merge(NTD19, prin3, by.x="City", by.y="City", all.x=TRUE, all.y=FALSE)
NTD19 = merge(NTD19, prin4, by.x="City", by.y="City", all.x=TRUE, all.y=FALSE)
NTD19$PC13 <- ifelse(is.na(NTD19$PC13), "Non-Principal City", NTD19$PC13)
NTD19$PC13 <- as.factor(NTD19$PC13)

NTD19$PC15 <- ifelse(is.na(NTD19$PC15), "Non-Principal City", NTD19$PC15)
NTD19$PC15 <- as.factor(NTD19$PC15)

NTD19$PC18 <- ifelse(is.na(NTD19$PC18), "Non-Principal City", NTD19$PC18)
NTD19$PC18 <- as.factor(NTD19$PC18)

NTD19$PC20 <- ifelse(is.na(NTD19$PC20), "Non-Principal City", NTD19$PC20)
NTD19$PC20 <- as.factor(NTD19$PC20)


##2018 revenue----
rev <- read_excel("2023 TS1.1 Total Funding Time Series.xlsx", sheet =3)
rev$NTDID <- rev$`NTD ID`
cols <- c(16:49)
rev <- rev[,cols]
rev2 <- pivot_longer(rev, cols = -c(NTDID), names_to = "year",
                     values_to = "revenue")
rev2 <- subset(rev2, rev2$year == "2018")
rev2$year = NULL
#set 0s to NA
rev2$revenue <- ifelse(rev2$revenue==0, NA, rev2$revenue)


#revenue sheet for some reason drops the 9RO2 prefix for agencies 
#if NTD ID ID is not in rev2, add 9R02 to the prefix of rev2 NTD ID
x <- setdiff(gtfs$NTDID, rev2$NTDID)
x <- sub("^9R02-", "", x) #remove to get the ids in rev2
rev2$NTDID <- ifelse(rev2$NTDID %in% x, paste0("9R02-", rev2$NTDID), rev2$NTDID)
gtfs <- merge(gtfs, rev2, by.x=c("NTDID") ,by.y=c("NTDID"), all.x=TRUE, all.y=FALSE)


#merge agency characteristics with gtfs updates
gtfs$AgencyName=NULL
gtfs=merge(gtfs, NTD19, by.x="NTDID", by.y="NTDID", all.x=FALSE, all.y=FALSE)
gtfsx = gtfs %>% filter(updates>0) %>% group_by(NTDID) %>% summarize(st_update=min(yrmo)) #create survival DF
write_xlsx(gtfs, "gtfs.xlsx")

#Survival Analysis---- 
#create DF with beginning of updates for adopters, merge with sampling frame
gtfsx$indicator = rep(1, length(gtfsx$NTDID))
gtfsx=merge(gtfsx, NTD19, by.x="NTDID", by.y="NTDID", all.x=FALSE, all.y=TRUE)
gtfsx$indicator[is.na(gtfsx$indicator)] = 0 
gtfsx$st[is.na(gtfsx$st)] = max(na.omit(gtfsx$st)) #max year for non adopters
gtfsx$time = gtfsx$st - min(gtfsx$st)#time to adoption
gtfsx$st_update <- NULL
gtfsx$st <- NULL #don't need, only includes adopters

##2013 NTD----

#add 2013 data, even though it will drop a LOT of covariates

NTD13=read_excel("2013 Agency Information_0.xlsx")
NTD13$poplog13 <- log(NTD13$`Service Area Population`)
NTD13$pop13 <- NTD13$`Service Area Population`
NTD13$milelog13 <-  log(NTD13$`Service Area (SQ Mi)`)
NTD13$mile13 <- NTD13$`Service Area (SQ Mi)`
NTD13 <- NTD13 %>% select(NTDID, poplog13,milelog13,  pop13, mile13)
NTD13 <- subset(NTD13, NTD13$NTDID %in% gtfsx$`Legacy NTD ID`)

gtfsx <- merge(gtfsx, NTD13, by.x="Legacy NTD ID", by.y="NTDID", all.x=TRUE, all.y=FALSE)


##2016 NTD----
NTD16=read_excel("2016 Agency Information.xlsx")
NTD16$NTDID <- NTD16$`5 Digit NTD ID`
NTD16$poplog16 <- log(NTD16$`Service Area Pop`)
NTD16$milelog16 <-  log(NTD16$`Service Area Sq Miles`)
NTD16<- NTD16 %>% select(NTDID, poplog16,milelog16)
NTD16 <- subset(NTD16, NTD16$NTDID %in% gtfsx$NTDID)
gtfsx <- merge(gtfsx, NTD16, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE)


##2021 NTD----

#for gtfs-r analysis

NTD21=read_excel("2021 Agency Information.xlsx")
NTD21$NTDID <- NTD21$`NTD ID`
NTD21$poplog21 <- log(NTD21$`Service Area Pop`)
NTD21$milelog21 <-  log(NTD21$`Service Area Sq Miles`)
NTD21<- NTD21 %>% select(NTDID, poplog21,milelog21)
gtfsx <- merge(gtfsx, NTD21, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE)


##2013 revenue----
rev <- read_excel("2023 TS1.1 Total Funding Time Series.xlsx", sheet =3)
rev$NTDID <- rev$`NTD ID`
cols <- c(16:49)
rev <- rev[,cols]
rev2 <- pivot_longer(rev, cols = -c(NTDID), names_to = "year",
                     values_to = "revenue13")
rev13 <- subset(rev2, rev2$year == "2013")  #2013 to be the start of the sampling frame 
rev13$year <- NULL
#many agencies report 0 revenue. Set these to NA so they are dropped
rev13$revenue13 <- ifelse(rev13$revenue13==0, NA, rev13$revenue13)
rev13$revlog13 <- log(rev13$revenue13)

# Identify rows in rev13$NTDID that are in x
x <- setdiff(gtfsx$NTDID, rev13$NTDID)
x <- sub("^9R02-", "", x) #remove to get the ids in rev2
rev13$NTDID <- ifelse(rev13$NTDID %in% x, paste0("9R02-", rev13$NTDID), rev13$NTDID)

gtfsx <- merge(gtfsx, rev13, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE)

##2016 revenue----

rev <- read_excel("2023 TS1.1 Total Funding Time Series.xlsx", sheet =3)
rev$NTDID <- rev$`NTD ID`
cols <- c(16:49)
rev <- rev[,cols]
rev2 <- pivot_longer(rev, cols = -c(NTDID), names_to = "year",
                     values_to = "revenue16")
rev16 <- subset(rev2, rev2$year == "2016")  #2016 to be the start of the sampling frame 
rev16$year <- NULL
#set 0s to NA
rev16$revenue16 <- ifelse(rev16$revenue16==0, NA, rev16$revenue16)
rev16$revlog16 <- log(rev16$revenue16)

#wrangle reporting changes
x <- setdiff(gtfsx$NTDID, rev16$NTDID)
x <- sub("^9R02-", "", x) #remove to get the ids in rev2
rev16$NTDID <- ifelse(rev16$NTDID %in% x, paste0("9R02-", rev16$NTDID), rev16$NTDID)

gtfsx <- merge(gtfsx, rev16, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE)

##2021 revenue----
#for GTFS-r

rev <- read_excel("2023 TS1.1 Total Funding Time Series.xlsx", sheet =3)
rev$NTDID <- rev$`NTD ID`
cols <- c(16:49)
rev <- rev[,cols]
rev2 <- pivot_longer(rev, cols = -c(NTDID), names_to = "year",
                     values_to = "revenue21")
rev21 <- subset(rev2, rev2$year == "2021")  #2021 to be the start of the sampling frame 
rev21$year <- NULL

#set 0s to NA
rev21$revenue21 <- ifelse(rev21$revenue21==0, NA, rev21$revenue21)
rev21$revlog21 <- log(rev21$revenue21)

x <- setdiff(gtfsx$NTDID, rev21$NTDID)
x <- sub("^9R02-", "", x) #remove to get the ids in rev2
rev21$NTDID <- ifelse(rev21$NTDID %in% x, paste0("9R02-", rev21$NTDID), rev21$NTDID)

gtfsx <- merge(gtfsx, rev21, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE)


##GTFS-r----
sources <- read.csv("sources.csv")
sources=subset(sources, sources$location.subdivision_name == "California" & sources$data_type=="gtfs-rt")
sources$AgencyName = toupper(sources$provider)
sources$City=toupper(sources$location.municipality)
sources <- unique(sources %>% select(AgencyName, City))

gtfsx$AgencyName=toupper(gtfsx$AgencyName)
gtfsx$City = toupper(gtfsx$City)
s1 = fastLink(gtfsx, sources,
                   varnames = c("AgencyName"), stringdist.match = c("AgencyName"),cut.p=.70,
              partial.match = c("AgencyName"), return.df = TRUE,stringdist.method="jw",
                   n.cores = 1) 
mdf <- getMatches(gtfsx, sources, s1)
mdf$gtfsrt <- rep(1, length(mdf$AgencyName))
mdf<- mdf %>% select(AgencyName, gtfsrt)
gtfsx <- merge(gtfsx, mdf, by.x="AgencyName", by.y="AgencyName", all.x=TRUE, all.y=FALSE)
#manually add in no matches
nm <- subset(sources, !(sources$AgencyName %in% s1$dfB.match$AgencyName)) #in gtfsrt that did not match
#for long agency names use gtfsx$AgencyName[gtfsx$NTDID == 90173]
gtfsx$gtfsrt[gtfsx$AgencyName == "SAN FRANCISCO BAY AREA RAPID TRANSIT DISTRICT"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "HUMBOLDT TRANSIT AUTHORITY"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "TRANSIT JOINT POWERS AUTHORITY FOR MERCED COUNTY"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "KERN REGIONAL TRANSIT"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "STANISLAUS COUNTY DBA: STANISLAUS REGIONAL TRANSIT"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "CITY OF PASADENA"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "CITY OF TURLOCK"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "NEVADA COUNTY TRANSIT SERVICES"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "CITY OF AVALON"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "MADERA COUNTY"] = 1
gtfsx$gtfsrt[gtfsx$AgencyName == "CITY OF COMMERCE"] = 1
gtfsx$gtfsrt <- ifelse( is.na(gtfsx$gtfsrt), 0, gtfsx$gtfsrt)

write_xlsx(gtfsx, "gtfsx.xlsx")


#Maintenance (2018-2021)----
gtfs <- gtfs[ , ! names(gtfs) %in% c("trips/updates",  
                                           "City", "County", "stops", "trips", "date")] 
gtfs_time = gtfs %>% filter(year > 2018) #sampling frame begins in 2019
gtfs_time =gtfs_time[!(gtfs_time$NTDID %in% c(unique(gtfs_time$NTDID[gtfs_time$updates_sum == 0]))),] #drop adopters after 2018
#drop agencies that did not update at all during the pandemic
gtfs_time <- gtfs_time %>% group_by(NTDID) %>% mutate(Us=cumsum(updates))
gtfs_time<-subset(gtfs_time, gtfs_time$NTDID %in% unique(gtfs_time$NTDID[gtfs_time$Us !=0])) 
gtfs_time$Us <- NULL
#American communities survey IVS from geo spatial analysis
acs=read_excel("gtfs_census_merged.xlsx")
#create a function, since you'll need this for robustness checks later
acs_fun <- function(x){
  x$blocks=x$`Total blocks`
  x$acs_pop=x$`Total values`
  x$age=x$`Median age values`
  x$internet=x$`Internet s values`
  x$income=x$`Median Hou values`
  x$white=x$`White alon values`
  x$black=x$`Black alon values`
  x$employed=x$`Unemployed values`
  x$asian=x$`Asian alon values`
  x$AgencyName <- x$`Agency Name` 
  x$NTDID <- x$`NTD ID`
  x$HH <- x$`Households values`
  #create population percentage specific variables and log income
  x=x %>% select(NTDID, blocks, acs_pop, age, internet, income, white, black, employed, asian)
  x$pc_nonwhite = 1-(x$white/x$acs_pop)
  x$pc_employed =x$employed/x$acs_pop
  x$logHHI =log(x$income)
  x$pc_internet = x$internet/x$acs_pop
  
  
  return(x)
}
acs <- acs_fun(acs)
gtfs19=unique(merge(gtfs_time, acs, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE))
gtfs19 <- subset(gtfs19, gtfs19$yrmo < 2022)
write_xlsx(gtfs19, "gtfs19.xlsx")

##Ridership----
UPT <- read_excel("UPT.xlsx")
UPT$reptype <- UPT$`Reporter Type`
UPT$`Reporter Type` <- NULL
UPT$UZA <- UPT$`UZA Name`
UPT$`UZA Name` <- NULL
UPT <- UPT[grep("CA", UPT$UZA), ]
UPT <- subset(UPT, UPT$Modes == "MB" |UPT$Modes == "RB"| UPT$Modes == "HR" | UPT$Modes == "LR" | UPT$Modes == "TB" | UPT$Modes == "CR" | UPT$Modes == "CB")                 
UPT <- pivot_longer(UPT, cols = -c(Agency, reptype, UZA, Modes, NTDID), names_to = "month",
                    values_to = "UPT")
UPT <- UPT %>% group_by(NTDID, month, Agency) %>% summarize(UPT = sum(na.omit(UPT))) 
#convert dates properly before taking lag
UPT$month <- as.yearmon(UPT$month, format = "%b%y")
UPT$month <- as.Date(UPT$month)
UPT$year <- format(UPT$month, format = "%Y")
UPT$year <- as.numeric(UPT$year)
UPT <- UPT %>% group_by(Agency) %>% 
  mutate(lagUPT = lag(UPT, n =1)) 
UPT <- UPT %>% group_by(Agency) %>%
  mutate(pct_change = (lagUPT/lag(lagUPT) - 1) * 100)
UPT$abspct_change <- abs(UPT$pct_change)
UPT$logpass <- log(UPT$lagUPT)
UPT$logpass <- ifelse(UPT$logpass == "-Inf", NA, UPT$logpass)
UPT$abspct_change <- ifelse(UPT$abspct_change == "Inf", NA, UPT$abspct_change)
UPT$pct_change <- ifelse(UPT$pct_change == "Inf", NA, UPT$pct_change)

#subset to correct time frame
UPT = UPT %>% filter(year > 2012)
UPT$NTDID <- as.character(UPT$NTDID)
UPT$yrmo <- as.yearmon(UPT$month, format = "%b%Y") #get month into same format
UPT$year <- NULL
UPT$Agency <- NULL
UPT$month <- NULL
riders = merge(gtfs, UPT, by.x=c("NTDID", "yrmo"), by.y=c("NTDID", "yrmo"), all.x=FALSE, all.y=FALSE)
#delete variables that arent relevant
riders$NTDID=as.factor(riders$NTDID)
riders <- riders  %>% group_by(NTDID) %>% 
  mutate(lagadopt = lag(adopt, n =2)) #lag 2x because UPT lagged once
riders$yrmo <- str_sub(riders$yrmo, 1, str_length(riders$yrmo)-5)#drop year for month fixed effects
write_xlsx(riders, "riders.xlsx")


##Max and Min---- 
acs_max <- read_excel("gtfs_census_merged_max.xlsx")
acs_min <- read_excel("gtfs_census_merged_min.xlsx")

acs_min$`NTD ID`[acs_min$`NTD ID` == "99422"] <- "90012"
acs_min$`NTD ID`[acs_min$`NTD ID` == "90086"] <- "90031"
acs_min$`NTD ID`[acs_min$`Agency Name` == "Livermore  Amador Valley Transit Authority"] <- "90144"
acs_min$`NTD ID`[acs_min$`Agency Name` == "Stanislaus Regional Transit"] <- "90236"
acs_min$`NTD ID`[acs_min$`Agency Name`=="Golden Gate Bridge, Highway and Transportation District"] <- "90016"

##create a function for renaming things, use for both max and min
acs_max <- acs_fun(acs_max)
acs_min <- acs_fun(acs_min)

gtfsmax=unique(merge(gtfs_time, acs_max, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE))
gtfsmin=unique(merge(gtfs_time, acs_min, by.x="NTDID", by.y="NTDID", all.x=TRUE, all.y=FALSE))
write_xlsx(gtfsmin, "gtfsmin.xlsx")
write_xlsx(gtfsmax, "gtfsmax.xlsx")

#Survey data----


##Clean survey-----
#read in qualtrics file (raw)
surv <- read_excel("survey/2.24TransportationSurvey_October 30, 2023_12.53.xlsx")
surv <- surv[-1, ]
surv$Q67=ifelse(surv$Q67=="Other", surv$Q68, surv$Q67)
#update stanislaus RTA name because they are on our list, just did not find themselves :)
surv$Q67 <- ifelse(surv$Q67 == "Stanislaus Regional Transit Authority", "Stanislaus Regional Transit",surv$Q67)
surv <- subset(surv, !(is.na(surv$Q67)))
surv$indicator <- apply(surv[, 18:51], 1, function(row) any(!is.na(row))) #indicator if they replied to anything at all
# Convert TRUE/FALSE to 1/0
surv <- subset(surv, surv$Q67 !="DOT" & surv$Q67 !="DOT2")
surv$indicator <- as.integer(surv$indicator)
surv <- subset(surv, surv$indicator==1)

#read in original sampling frame to ID agency characteristics
samp_frame <- read.csv("survey/transit_survey.csv")
samp_frame$survey_response <- ifelse(samp_frame$AgencyName %in% surv$Q67, 1, 0)


####Multiple responses-----
# Identify the most complete response for each agency

#Select the most complete response, and if responses are same completion, we select the longer duration
surv$`Duration (in seconds)` <- as.numeric(surv$`Duration (in seconds)`)

#'Q67' is the agency name and Q7_1 to Q8_17 are the categorical question variables
completeness_duration_ranking <- surv %>%
  group_by(ResponseId) %>%
  summarize(
    completeness = sum(!is.na(across(starts_with('Q')))),
    max_duration = max(`Duration (in seconds)`, na.rm = TRUE)
  )

# Initialize completeness values in the original data frame
surv <- left_join(surv, completeness_duration_ranking, by = "ResponseId")

# Sort by completeness and duration descending
surv <- surv %>% arrange(Q67, desc(completeness), desc(max_duration))

# Initialize a new column to store agency completeness rank
surv$agency_rank <- NA

# Assign completeness ranks to each agency
unique_agencies <- unique(surv$Q67)
for (agency in unique_agencies) {
  rank <- seq_len(sum(surv$Q67 == agency))
  surv$agency_rank[surv$Q67 == agency] <- rank
}

surv_filled <- surv %>%
  group_by(Q67) %>%
  arrange(agency_rank) %>%
  fill(starts_with('Q'), .direction = 'up') %>%
  ungroup()

# Select only the columns you need
result_df_filled <- surv_filled %>%
  filter(agency_rank == 1) %>%
  select(Q67, starts_with('Q')) %>%
  distinct()

#let's join orgtype to the survey reults
surv2 <- samp_frame %>% select(AgencyName, OrgType)
result_df_filled = merge(result_df_filled, surv2, by.x="Q67", by.y="AgencyName", all.x=FALSE, all.y=FALSE)


###real time transit data----

table(result_df_filled$Q7_12) #one adopter from MPA
table(result_df_filled$Q8_12)
result_df_filled$RTPD <- ifelse(result_df_filled$Q8_12=="Procuring"|result_df_filled$Q8_12=="Piloting"|result_df_filled$Q8_12=="Using Systemwide"|
                                  result_df_filled$Q7_12=="Yes" , 1,0)
sum(na.omit(result_df_filled$RTPD))

##rationale-----
selected_columns <- c('Q67','RTPD', 'OrgType', 'Q38_1', 'Q38_2', 'Q38_3', 'Q38_4', 'Q38_5', 'Q38_6', 'Q38_7', 'Q38_8', 'Q38_9', 'Q38_10', 'Q38_10_TEXT')

# Subset the DataFrame
subset_df <- result_df_filled[selected_columns]
# Assuming your DataFrame is named result_df_filled

# Define a mapping between column names and reasons
reason_mapping <- c(
  "Q67"="AgencyName",
  "RTPD"="RTPD",
  "OrgType"="OrgType",
  "Q38_1" = "Obsolescence of old technology",
  "Q38_2" = "Workforce expectations",
  "Q38_3" = "Regulatory compliance, state mandates",
  "Q38_4" = "Transparency and public relations",
  "Q38_5" = "Public pressure",
  "Q38_6" = "Operating cost reductions",
  "Q38_7" = "Improve user experience and safety",
  "Q38_8" = "Environmental concerns",
  "Q38_9" = "Expand access to underserved communities",
  "Q38_10" = "Other",
  "Q38_10_TEXT" = "Other - Text"
)

# Rename the columns using the mapping

colnames(subset_df) <- reason_mapping[colnames(subset_df)]

#keep only moderately and very important
subset_df_binary <- subset_df %>%
  mutate_at(
    vars(-c("OrgType", "AgencyName", "RTPD")),
    ~as.numeric(. %in% c("Very important", "Extremely important","Moderately important"))
  )

#subset to only adopters
rtpd <- subset(subset_df_binary, subset_df_binary$RTPD == 1)

#totals for rationale
column_sums <- as.data.frame(colSums(rtpd[, !names(subset_df_binary) %in% c("AgencyName", "OrgType", "RTPD")]))
print(column_sums)
column_sums$percent <- column_sums$`colSums(rtpd[, !names(subset_df_binary) %in% c("AgencyName", "OrgType", "RTPD")])`/length(unique(rtpd$AgencyName))

#add in NTDID
samp_frame2 <- samp_frame %>% select(NTDID, AgencyName)
rtpd <- merge(rtpd, samp_frame2, by.x="AgencyName", by.y="AgencyName", all.x=TRUE, all.y=FALSE)

##covariates----
#double check all NTDIDs are accurate
setdiff(rtpd$NTDID, NTD21$NTDID)
setdiff(rtpd$NTDID, rev21$NTDID)
#read in again for the unlogged covariates...
NTD21=read_excel("2021 Agency Information.xlsx")
NTD21$NTDID <- NTD21$`NTD ID`
NTD21$pop21 <-NTD21$`Service Area Pop`
NTD21$mile21 <-  NTD21$`Service Area Sq Miles`
NTD21$VOMS <- NTD21$`Total VOMS`
NTD21<- NTD21 %>% select(NTDID, pop21, mile21, VOMS)
rtpd <- merge(rtpd, NTD21, by.x="NTDID", by.y="NTDID", all.x=TRUE,all.y=FALSE)
rtpd <- merge(rtpd, rev21, by.x="NTDID", by.y="NTDID", all.x=TRUE,all.y=FALSE)

#tabulate large and small agencies
rtpd$size <- ifelse(rtpd$VOMS > 100 & !(is.na(rtpd$VOMS)), 1, NA)
rtpd$size <- ifelse(rtpd$VOMS < 100 & !(is.na(rtpd$VOMS)), 0, rtpd$size)

write_xlsx(rtpd, "rtpd.xlsx")

#numbers for endnote xxx
rtpd_summ <- rtpd %>% group_by(size) %>% summarize(transp = sum(na.omit(`Transparency and public relations`)), n=length(unique(AgencyName)))
rtpd_summ$pct <- rtpd_summ$transp/rtpd_summ$n


